#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-07-09 07:33:14
# Project: papa
import re
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """pyspider handler that crawls pages on www.cs.tsinghua.edu.cn.

    Flow: ``on_start`` schedules the index page, ``index_page`` follows
    every absolute link located under ``DETAIL_URL_PREFIX``, and
    ``detail_page`` returns one result dict per followed page.
    """

    crawl_config = {
    }

    # Entry page whose links are scanned for detail pages.
    START_URL = 'http://www.cs.tsinghua.edu.cn/publish/cs/4797/index.html'
    # Only links that start with this exact prefix are followed.
    DETAIL_URL_PREFIX = 'http://www.cs.tsinghua.edu.cn/publish/cs/4616/'

    @every(minutes=24 * 60)
    def on_start(self):
        """Schedule the index page; decorated to run every 24 * 60 minutes."""
        self.crawl(self.START_URL, callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue each detail-page link found on the index page.

        Args:
            response: the fetched index page; ``response.doc`` is queried
                with a CSS selector for anchors whose ``href`` starts
                with ``http``.
        """
        for each in response.doc('a[href^="http"]').items():
            href = each.attr.href
            # A literal prefix test: the previous unescaped regex let "."
            # match any character, so look-alike hosts could slip through.
            if href.startswith(self.DETAIL_URL_PREFIX):
                self.crawl(href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        """Build the result record for one detail page.

        Args:
            response: the fetched detail page.

        Returns:
            dict with ``"url"`` (the page URL) and ``"title"`` (the text of
            the ``<p>`` elements inside ``#s2_right_con``).
        """
        return {
            "url": response.url,
            "title": response.doc('html > body > .auto > .content > table tr > .box_right > #s2_right_con > p').text()
        }
